import pandas as pd
import numpy as np
from cytoolz import *
from tqdm.auto import tqdm
tqdm.pandas()
df = pd.read_json('s3://ling583/review.json.gz', lines=True, storage_options={'anon':True})
df.head()
| ratings | title | text | author | date_stayed | offering_id | num_helpful_votes | date | id | via_mobile | |
|---|---|---|---|---|---|---|---|---|---|---|
| 0 | {'service': 5.0, 'cleanliness': 5.0, 'overall'... | “Truly is "Jewel of the Upper Wets Side"” | Stayed in a king suite for 11 nights and yes i... | {'username': 'Papa_Panda', 'num_cities': 22, '... | December 2012 | 93338 | 0 | 2012-12-17 | 147643103 | False |
| 1 | {'service': 5.0, 'cleanliness': 5.0, 'overall'... | “My home away from home!” | On every visit to NYC, the Hotel Beacon is the... | {'username': 'Maureen V', 'num_reviews': 2, 'n... | December 2012 | 93338 | 0 | 2012-12-17 | 147639004 | False |
| 2 | {'service': 4.0, 'cleanliness': 5.0, 'overall'... | “Great Stay” | This is a great property in Midtown. We two di... | {'username': 'vuguru', 'num_cities': 12, 'num_... | December 2012 | 1762573 | 0 | 2012-12-18 | 147697954 | False |
| 3 | {'service': 5.0, 'cleanliness': 5.0, 'overall'... | “Modern Convenience” | The Andaz is a nice hotel in a central locatio... | {'username': 'Hotel-Designer', 'num_cities': 5... | August 2012 | 1762573 | 0 | 2012-12-17 | 147625723 | False |
| 4 | {'service': 4.0, 'cleanliness': 5.0, 'overall'... | “Its the best of the Andaz Brand in the US....” | I have stayed at each of the US Andaz properti... | {'username': 'JamesE339', 'num_cities': 34, 'n... | December 2012 | 1762573 | 0 | 2012-12-17 | 147612823 | False |
len(df)
878561
First question: what language(s) are the reviews in?
df[df['text'].str.contains(' der ')]
| ratings | title | text | author | date_stayed | offering_id | num_helpful_votes | date | id | via_mobile | |
|---|---|---|---|---|---|---|---|---|---|---|
| 1107 | {'service': 1.0, 'cleanliness': 4.0, 'overall'... | “BW Plus-Hotel mit guter Lage nach Downtown” | Das Hotel liegt optimal nach Downtown (incl. Z... | {'username': 'ReiseReise1000', 'num_cities': 3... | May 2012 | 239853 | 1 | 2012-06-14 | 131978977 | False |
| 1110 | {'service': 4.0, 'cleanliness': 5.0, 'overall'... | “Gutes Stadthotel” | Wir hatten nur eine Übernachtung in Houston bi... | {'username': 'LizErlangen', 'num_cities': 45, ... | June 2009 | 239853 | 0 | 2010-07-17 | 71286274 | False |
| 1703 | {'service': 5.0, 'cleanliness': 5.0, 'overall'... | “Grandioses Hotel mit der besten Lage direkt a... | Eine bessere Lage wird man in New York City fü... | {'username': 'b00gizm', 'num_reviews': 2, 'num... | December 2011 | 93559 | 0 | 2011-12-30 | 122273065 | False |
| 1843 | {'service': 4.0, 'cleanliness': 4.0, 'overall'... | “Immer wieder” | Dieses ist immer wieder eines meiner Favourite... | {'username': 'shaquelle_la_01', 'num_cities': ... | April 2011 | 1158926 | 0 | 2011-10-08 | 119069086 | False |
| 2089 | {'overall': 4.0} | “Drury Inn, nahe bei Restaurants & Shopping.” | Ein solides Mittelklassenhotel, mit zahlreiche... | {'username': '', 'id': '', 'location': ''} | NaN | 107915 | 0 | 2010-02-08 | 55617990 | False |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 878544 | {'service': 5.0, 'cleanliness': 5.0, 'overall'... | “Ausgezeichnet” | Wir waren zu Cherry Blossom in DC und können d... | {'username': 'xyzwien05', 'num_cities': 25, 'n... | April 2010 | 84093 | 0 | 2010-08-30 | 77508503 | False |
| 878548 | {'service': 4.0, 'cleanliness': 5.0, 'overall'... | “Perfekte Location, modernes & chices Ambiente... | Das Hotel Palomar gehoert zur Kimpton Gruppe, ... | {'username': 'berlinrocks', 'num_cities': 3, '... | July 2010 | 84093 | 0 | 2010-08-02 | 73467940 | False |
| 878549 | {'service': 5.0, 'cleanliness': 5.0, 'overall'... | “Superhotel Superlage” | Anlässlich einer Geschäftsreise waren wir das ... | {'username': 'stellatux', 'num_reviews': 3, 'n... | September 2009 | 84093 | 0 | 2010-08-02 | 73351022 | False |
| 878556 | {'overall': 4.0} | “vil komme igen” | Pænt hotel i et livligt miljø. Store værelser,... | {'username': '', 'id': '', 'location': ''} | NaN | 84093 | 0 | 2008-08-31 | 51441576 | False |
| 878559 | {'cleanliness': 5.0, 'overall': 5.0, 'rooms': ... | “Gerne wieder” | Sehr schön ausgestattetes Hotel in bester Lage... | {'username': '', 'id': '', 'location': ''} | July 2008 | 84093 | 0 | 2008-04-01 | 15564508 | False |
19871 rows × 10 columns
A simple language guesser using the pycld2 library
import pycld2
def guess_lang(text):
    """Guess the language of ``text`` using pycld2.

    Returns the name of the first language pycld2 reports (for example
    ``'ENGLISH'``) when pycld2 marks the detection as reliable.  Returns
    ``np.nan`` when the detection is not reliable or when pycld2 raises
    ``pycld2.error``, so undetermined rows show up as missing values in a
    pandas column.
    """
    try:
        reliable, _, langs = pycld2.detect(text, isPlainText=True, hintLanguage='en')
    except pycld2.error:
        # Best effort: text that pycld2 rejects is treated as undetermined.
        return np.nan
    # ``np.nan`` (lowercase) is used because the ``np.NaN`` alias was removed
    # in NumPy 2.0.
    return langs[0][0] if reliable else np.nan
df['text'].loc[0]
'Stayed in a king suite for 11 nights and yes it cots us a bit but we were happy with the standard of room, the location and the friendliness of the staff. Our room was on the 20th floor overlooking Broadway and the madhouse of the Fairway Market. Room was quite with no noise evident from the hallway or adjoining rooms. It was great to be able to open windows when we craved fresh rather than heated air. The beds, including the fold out sofa bed, were comfortable and the rooms were cleaned well. Wi-fi access worked like a dream with only one connectivity issue on our first night and this was promptly responded to with a call from the service provider to ensure that all was well. The location close to the 72nd Street subway station is great and the complimentary umbrellas on the drizzly days were greatly appreciated. It is fabulous to have the kitchen with cooking facilities and the access to a whole range of fresh foods directly across the road at Fairway.\nThis is the second time that members of the party have stayed at the Beacon and it will certainly be our hotel of choice for future visits.'
guess_lang(df['text'].loc[0])
'ENGLISH'
df['text'].loc[878559]
'Sehr schön ausgestattetes Hotel in bester Lage zur Innenstadt/Duupont Circle/Georgetown. Sehr gute Anbindung an öffentliche Verkehrsmittel und nahe an guten Restaurants. Aufmerksamer Service und innovatives Management (jeden Tag ein Empfang mit Wein für Gäste des Hotels in der Lobby am späten Nachmittag; morgens Tee und Kaffee/Tageszeitung gratis)'
guess_lang(df['text'].loc[878559])
'GERMAN'
Add 'lang' column to our dataframe
df['lang']=df['text'].progress_apply(guess_lang)
df['lang'].value_counts()
ENGLISH 771175 GERMAN 26879 FRENCH 24588 ITALIAN 19968 SPANISH 14044 Japanese 6009 PORTUGUESE 4436 DUTCH 3210 SWEDISH 2943 DANISH 1616 NORWEGIAN 1304 Chinese 524 RUSSIAN 271 TURKISH 110 POLISH 50 ChineseT 27 NORWEGIAN_N 21 GREEK 20 Korean 17 THAI 15 INDONESIAN 4 OCCITAN 3 CORSICAN 2 GALICIAN 1 HUNGARIAN 1 ARABIC 1 AFAR 1 CATALAN 1 AFRIKAANS 1 Name: lang, dtype: int64
df=df[df['lang']=='ENGLISH']
len(df)
771175
df=df.reset_index()
df.head()
| index | ratings | title | text | author | date_stayed | offering_id | num_helpful_votes | date | id | via_mobile | lang | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 0 | {'service': 5.0, 'cleanliness': 5.0, 'overall'... | “Truly is "Jewel of the Upper Wets Side"” | Stayed in a king suite for 11 nights and yes i... | {'username': 'Papa_Panda', 'num_cities': 22, '... | December 2012 | 93338 | 0 | 2012-12-17 | 147643103 | False | ENGLISH |
| 1 | 1 | {'service': 5.0, 'cleanliness': 5.0, 'overall'... | “My home away from home!” | On every visit to NYC, the Hotel Beacon is the... | {'username': 'Maureen V', 'num_reviews': 2, 'n... | December 2012 | 93338 | 0 | 2012-12-17 | 147639004 | False | ENGLISH |
| 2 | 2 | {'service': 4.0, 'cleanliness': 5.0, 'overall'... | “Great Stay” | This is a great property in Midtown. We two di... | {'username': 'vuguru', 'num_cities': 12, 'num_... | December 2012 | 1762573 | 0 | 2012-12-18 | 147697954 | False | ENGLISH |
| 3 | 3 | {'service': 5.0, 'cleanliness': 5.0, 'overall'... | “Modern Convenience” | The Andaz is a nice hotel in a central locatio... | {'username': 'Hotel-Designer', 'num_cities': 5... | August 2012 | 1762573 | 0 | 2012-12-17 | 147625723 | False | ENGLISH |
| 4 | 4 | {'service': 4.0, 'cleanliness': 5.0, 'overall'... | “Its the best of the Andaz Brand in the US....” | I have stayed at each of the US Andaz properti... | {'username': 'JamesE339', 'num_cities': 34, 'n... | December 2012 | 1762573 | 0 | 2012-12-17 | 147612823 | False | ENGLISH |
The values for the 'ratings' column are dictionaries. We need to add these as separate columns using pandas.json_normalize
df['ratings'].head()
0 {'service': 5.0, 'cleanliness': 5.0, 'overall'...
1 {'service': 5.0, 'cleanliness': 5.0, 'overall'...
2 {'service': 4.0, 'cleanliness': 5.0, 'overall'...
3 {'service': 5.0, 'cleanliness': 5.0, 'overall'...
4 {'service': 4.0, 'cleanliness': 5.0, 'overall'...
Name: ratings, dtype: object
pd.json_normalize(df['ratings'].head())
| service | cleanliness | overall | value | location | sleep_quality | rooms | |
|---|---|---|---|---|---|---|---|
| 0 | 5.0 | 5.0 | 5.0 | 5.0 | 5.0 | 5.0 | 5.0 |
| 1 | 5.0 | 5.0 | 5.0 | 5.0 | 5.0 | 5.0 | 5.0 |
| 2 | 4.0 | 5.0 | 4.0 | 4.0 | 5.0 | 4.0 | 4.0 |
| 3 | 5.0 | 5.0 | 4.0 | 5.0 | 5.0 | 5.0 | 5.0 |
| 4 | 4.0 | 5.0 | 4.0 | 3.0 | 5.0 | 5.0 | 5.0 |
df=pd.concat([df, pd.json_normalize(df['ratings'])], axis=1)
Now the DataFrame is ready to work with!
df.head()
| index | ratings | title | text | author | date_stayed | offering_id | num_helpful_votes | date | id | ... | lang | service | cleanliness | overall | value | location | sleep_quality | rooms | check_in_front_desk | business_service_(e_g_internet_access) | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 0 | {'service': 5.0, 'cleanliness': 5.0, 'overall'... | “Truly is "Jewel of the Upper Wets Side"” | Stayed in a king suite for 11 nights and yes i... | {'username': 'Papa_Panda', 'num_cities': 22, '... | December 2012 | 93338 | 0 | 2012-12-17 | 147643103 | ... | ENGLISH | 5.0 | 5.0 | 5.0 | 5.0 | 5.0 | 5.0 | 5.0 | NaN | NaN |
| 1 | 1 | {'service': 5.0, 'cleanliness': 5.0, 'overall'... | “My home away from home!” | On every visit to NYC, the Hotel Beacon is the... | {'username': 'Maureen V', 'num_reviews': 2, 'n... | December 2012 | 93338 | 0 | 2012-12-17 | 147639004 | ... | ENGLISH | 5.0 | 5.0 | 5.0 | 5.0 | 5.0 | 5.0 | 5.0 | NaN | NaN |
| 2 | 2 | {'service': 4.0, 'cleanliness': 5.0, 'overall'... | “Great Stay” | This is a great property in Midtown. We two di... | {'username': 'vuguru', 'num_cities': 12, 'num_... | December 2012 | 1762573 | 0 | 2012-12-18 | 147697954 | ... | ENGLISH | 4.0 | 5.0 | 4.0 | 4.0 | 5.0 | 4.0 | 4.0 | NaN | NaN |
| 3 | 3 | {'service': 5.0, 'cleanliness': 5.0, 'overall'... | “Modern Convenience” | The Andaz is a nice hotel in a central locatio... | {'username': 'Hotel-Designer', 'num_cities': 5... | August 2012 | 1762573 | 0 | 2012-12-17 | 147625723 | ... | ENGLISH | 5.0 | 5.0 | 4.0 | 5.0 | 5.0 | 5.0 | 5.0 | NaN | NaN |
| 4 | 4 | {'service': 4.0, 'cleanliness': 5.0, 'overall'... | “Its the best of the Andaz Brand in the US....” | I have stayed at each of the US Andaz properti... | {'username': 'JamesE339', 'num_cities': 34, 'n... | December 2012 | 1762573 | 0 | 2012-12-17 | 147612823 | ... | ENGLISH | 4.0 | 5.0 | 4.0 | 3.0 | 5.0 | 5.0 | 5.0 | NaN | NaN |
5 rows × 21 columns
Load datafile
import pandas as pd
import numpy as np
from cytoolz import *
from tqdm.auto import tqdm
tqdm.pandas()
df = pd.read_json('s3://ling583/review.json.gz', lines=True, storage_options={'anon':True})
A simple language guesser using the pycld2 library
import pycld2
def guess_lang(text):
    """Guess the language of ``text`` using pycld2.

    Returns the name of the first language pycld2 reports (for example
    ``'ENGLISH'``) when pycld2 marks the detection as reliable.  Returns
    ``np.nan`` when the detection is not reliable or when pycld2 raises
    ``pycld2.error``, so undetermined rows show up as missing values in a
    pandas column.
    """
    try:
        reliable, _, langs = pycld2.detect(text, isPlainText=True, hintLanguage='en')
    except pycld2.error:
        # Best effort: text that pycld2 rejects is treated as undetermined.
        return np.nan
    # ``np.nan`` (lowercase) is used because the ``np.NaN`` alias was removed
    # in NumPy 2.0.
    return langs[0][0] if reliable else np.nan
Add 'lang' column to our dataframe
df['lang'] = df['text'].progress_apply(guess_lang)
Select just the reviews in English
df = df[df['lang']=='ENGLISH'].reset_index()
len(df)
771175
The values for the 'ratings' column are dictionaries. We need to add these as separate columns using pandas.json_normalize
df = pd.concat([df, pd.json_normalize(df['ratings'])], axis=1)
Now the DataFrame is ready to work with!
df.head()
| index | ratings | title | text | author | date_stayed | offering_id | num_helpful_votes | date | id | ... | lang | service | cleanliness | overall | value | location | sleep_quality | rooms | check_in_front_desk | business_service_(e_g_internet_access) | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 0 | {'service': 5.0, 'cleanliness': 5.0, 'overall'... | “Truly is "Jewel of the Upper Wets Side"” | Stayed in a king suite for 11 nights and yes i... | {'username': 'Papa_Panda', 'num_cities': 22, '... | December 2012 | 93338 | 0 | 2012-12-17 | 147643103 | ... | ENGLISH | 5.0 | 5.0 | 5.0 | 5.0 | 5.0 | 5.0 | 5.0 | NaN | NaN |
| 1 | 1 | {'service': 5.0, 'cleanliness': 5.0, 'overall'... | “My home away from home!” | On every visit to NYC, the Hotel Beacon is the... | {'username': 'Maureen V', 'num_reviews': 2, 'n... | December 2012 | 93338 | 0 | 2012-12-17 | 147639004 | ... | ENGLISH | 5.0 | 5.0 | 5.0 | 5.0 | 5.0 | 5.0 | 5.0 | NaN | NaN |
| 2 | 2 | {'service': 4.0, 'cleanliness': 5.0, 'overall'... | “Great Stay” | This is a great property in Midtown. We two di... | {'username': 'vuguru', 'num_cities': 12, 'num_... | December 2012 | 1762573 | 0 | 2012-12-18 | 147697954 | ... | ENGLISH | 4.0 | 5.0 | 4.0 | 4.0 | 5.0 | 4.0 | 4.0 | NaN | NaN |
| 3 | 3 | {'service': 5.0, 'cleanliness': 5.0, 'overall'... | “Modern Convenience” | The Andaz is a nice hotel in a central locatio... | {'username': 'Hotel-Designer', 'num_cities': 5... | August 2012 | 1762573 | 0 | 2012-12-17 | 147625723 | ... | ENGLISH | 5.0 | 5.0 | 4.0 | 5.0 | 5.0 | 5.0 | 5.0 | NaN | NaN |
| 4 | 4 | {'service': 4.0, 'cleanliness': 5.0, 'overall'... | “Its the best of the Andaz Brand in the US....” | I have stayed at each of the US Andaz properti... | {'username': 'JamesE339', 'num_cities': 34, 'n... | December 2012 | 1762573 | 0 | 2012-12-17 | 147612823 | ... | ENGLISH | 4.0 | 5.0 | 4.0 | 3.0 | 5.0 | 5.0 | 5.0 | NaN | NaN |
5 rows × 21 columns
df['overall'].value_counts()
5.0 316641 4.0 247575 3.0 103742 2.0 53401 1.0 49815 0.0 1 Name: overall, dtype: int64
df[df['overall']==0]
| index | ratings | title | text | author | date_stayed | offering_id | num_helpful_votes | date | id | ... | lang | service | cleanliness | overall | value | location | sleep_quality | rooms | check_in_front_desk | business_service_(e_g_internet_access) | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 526911 | 618026 | {'overall': 0.0} | “Perfect Location, large room - stayed for two... | Best location. Right where Pier 39 is. Lots of... | {'username': 'Christine M', 'num_cities': 4, '... | NaN | 81222 | 1 | 2012-02-28 | 125344661 | ... | ENGLISH | NaN | NaN | 0.0 | NaN | NaN | NaN | NaN | NaN | NaN |
1 rows × 21 columns
df = df[df['overall']>0]
len(df)
771174
import re
from collections import Counter
def tokenize(text):
    """Split ``text`` into lowercase alphabetic tokens.

    The text is lowercased and every maximal run of the letters a-z becomes
    one token; digits, punctuation, whitespace and any other characters act
    only as separators.  Returns a list of strings (empty when the text
    contains no a-z letters).
    """
    lowered = text.lower()
    return re.findall(r'[a-z]+', lowered)
tokenize(df['text'].loc[0])
['stayed', 'in', 'a', 'king', 'suite', 'for', 'nights', 'and', 'yes', 'it', 'cots', 'us', 'a', 'bit', 'but', 'we', 'were', 'happy', 'with', 'the', 'standard', 'of', 'room', 'the', 'location', 'and', 'the', 'friendliness', 'of', 'the', 'staff', 'our', 'room', 'was', 'on', 'the', 'th', 'floor', 'overlooking', 'broadway', 'and', 'the', 'madhouse', 'of', 'the', 'fairway', 'market', 'room', 'was', 'quite', 'with', 'no', 'noise', 'evident', 'from', 'the', 'hallway', 'or', 'adjoining', 'rooms', 'it', 'was', 'great', 'to', 'be', 'able', 'to', 'open', 'windows', 'when', 'we', 'craved', 'fresh', 'rather', 'than', 'heated', 'air', 'the', 'beds', 'including', 'the', 'fold', 'out', 'sofa', 'bed', 'were', 'comfortable', 'and', 'the', 'rooms', 'were', 'cleaned', 'well', 'wi', 'fi', 'access', 'worked', 'like', 'a', 'dream', 'with', 'only', 'one', 'connectivity', 'issue', 'on', 'our', 'first', 'night', 'and', 'this', 'was', 'promptly', 'responded', 'to', 'with', 'a', 'call', 'from', 'the', 'service', 'provider', 'to', 'ensure', 'that', 'all', 'was', 'well', 'the', 'location', 'close', 'to', 'the', 'nd', 'street', 'subway', 'station', 'is', 'great', 'and', 'the', 'complimentary', 'umbrellas', 'on', 'the', 'drizzly', 'days', 'were', 'greatly', 'appreciated', 'it', 'is', 'fabulous', 'to', 'have', 'the', 'kitchen', 'with', 'cooking', 'facilities', 'and', 'the', 'access', 'to', 'a', 'whole', 'range', 'of', 'fresh', 'foods', 'directly', 'across', 'the', 'road', 'at', 'fairway', 'this', 'is', 'the', 'second', 'time', 'that', 'members', 'of', 'the', 'party', 'have', 'stayed', 'at', 'the', 'beacon', 'and', 'it', 'will', 'certainly', 'be', 'our', 'hotel', 'of', 'choice', 'for', 'future', 'visits']
def count(texts):
    """Count token occurrences across all of ``texts``.

    Each text is passed through ``tokenize`` and the resulting tokens are
    tallied into a single ``Counter``.  Iteration is wrapped in ``tqdm`` so a
    progress bar is shown while the texts are processed.
    """
    tallies = Counter()
    for doc in tqdm(texts):
        tallies.update(tokenize(doc))
    return tallies
pd.Series(count(df['text'][:10]))
stayed 7
in 33
a 49
king 1
suite 10
..
ave 1
cool 1
pic 1
lucky 1
satisfied 1
Length: 648, dtype: int64
f = pd.DataFrame()
f['all'] = pd.Series(count(df['text']))
f.head()
| all | |
|---|---|
| stayed | 332634 |
| in | 1990577 |
| a | 3466481 |
| king | 52407 |
| suite | 78901 |
f['all'].sort_values(ascending=False)
the 7835500
and 4215505
a 3466481
to 3066720
was 2605620
...
attory 1
quarrell 1
supermarkts 1
chipoltees 1
humidiifer 1
Name: all, Length: 194454, dtype: int64
f['bad'] = pd.Series(count(df[df['overall']==1.0]['text']))
f.head()
| all | bad | |
|---|---|---|
| stayed | 332634 | 19737.0 |
| in | 1990577 | 164529.0 |
| a | 3466481 | 237214.0 |
| king | 52407 | 2974.0 |
| suite | 78901 | 3774.0 |
import llr
from math import log2
N_all = f['all'].sum()
N_bad = f['bad'].sum()
N_all, N_bad
(119584444, 9821109.0)
def G2(f12, f1, f2, N):
    """Score an association from marginal counts via ``llr.llr_root``.

    Arguments are the joint count ``f12``, the two marginal counts ``f1`` and
    ``f2``, and the grand total ``N``.  They are converted into the four
    cells of a 2x2 contingency table, which are handed to ``llr.llr_root``.
    NOTE(review): presumably ``llr_root`` returns a signed root
    log-likelihood ratio -- confirm against the ``llr`` module.
    """
    both = f12                         # in group 1 and in group 2
    only_first = f1 - f12              # in group 1 but not group 2
    only_second = f2 - f12             # in group 2 but not group 1
    neither = N - f1 - f2 + f12        # in neither group
    return llr.llr_root(both, only_first, only_second, neither)
def pmi(f12, f1, f2, N):
    """Return pointwise mutual information (base 2) from raw counts.

    Computes ``log2(f12) + log2(N) - log2(f1) - log2(f2)``, i.e. the log of
    ``(f12 * N) / (f1 * f2)``, where ``f12`` is the joint count, ``f1`` and
    ``f2`` are the marginal counts and ``N`` is the grand total.  The result
    is built as a sum of logarithms rather than a log of the ratio.
    """
    joint_term = log2(f12) + log2(N)
    return joint_term - log2(f1) - log2(f2)
f['bad_G2'] = f.progress_apply(lambda r: G2(r['bad'], r['all'], N_bad, N_all), axis=1)
f['bad_pmi'] = f.progress_apply(lambda r: pmi(r['bad'], r['all'], N_bad, N_all), axis=1)
f.sort_values(by='bad_pmi', ascending=False).head()
| all | bad | bad_G2 | bad_pmi | |
|---|---|---|---|---|
| unemcumbered | 1 | 1.0 | 2.235839 | 3.606 |
| decription | 2 | 2.0 | 3.161954 | 3.606 |
| eavedropping | 1 | 1.0 | 2.235839 | 3.606 |
| ians | 1 | 1.0 | 2.235839 | 3.606 |
| fisting | 1 | 1.0 | 2.235839 | 3.606 |
f[f['all']>500].sort_values(by='bad_pmi', ascending=False).head()
| all | bad | bad_G2 | bad_pmi | |
|---|---|---|---|---|
| filth | 561 | 384.0 | 35.361845 | 3.059106 |
| infested | 654 | 447.0 | 38.125672 | 3.056984 |
| refund | 5287 | 3408.0 | 102.365867 | 2.972476 |
| dump | 3099 | 1911.0 | 75.040519 | 2.908525 |
| filthy | 6223 | 3830.0 | 106.138496 | 2.905734 |
f.sort_values(by='bad_G2', ascending=False).head()
| all | bad | bad_G2 | bad_pmi | |
|---|---|---|---|---|
| told | 79798 | 22745.0 | 167.713459 | 1.795197 |
| i | 2179220 | 247532.0 | 162.306308 | 0.467875 |
| dirty | 28117 | 12111.0 | 157.668814 | 2.390875 |
| they | 543213 | 77111.0 | 147.247801 | 0.789491 |
| he | 105469 | 23554.0 | 140.655548 | 1.443225 |
import matplotlib.pyplot as plt
from wordcloud import WordCloud
def cloud(freqs, k=200):
    """Draw a word cloud of the ``k`` highest-valued entries of ``freqs``.

    ``freqs`` is a pandas Series mapping words (the index) to weights.  The
    series is sorted in descending order, the first ``k`` entries are kept,
    and they are rendered as a 1000x1000 white-background word cloud inside
    a 10x10 inch matplotlib figure with the axes hidden.
    """
    plt.figure(figsize=(10, 10))
    ranked = freqs.sort_values(ascending=False)
    top_items = dict(ranked.head(k))
    renderer = WordCloud(width=1000, height=1000, background_color='white')
    image = renderer.generate_from_frequencies(top_items)
    plt.axis('off')
    plt.imshow(image, interpolation='bilinear')
    plt.show()
cloud(f['bad'])
cloud(f[f['all']>500]['bad_pmi'])
cloud(f['bad_G2'])
from kwic import kwic
kwic('told', df[df['overall']==1.0]['text'])
| 29 | vailable and the person I had spoken to was not on duty at the time. I was | told | to check again later and I responded that if no other room was available I |
| 29 | ew minutes, went back down and said I would check out. At that point I was | told | that since it was by then past noon I was to pay for the room anyway! Upon |
| 75 | ooking we were advised that ALL ROOMS have a bathtub - "no sorry, you were | told | wrong, and there's nothing I can do but you are welcome to leave". Then my |
| 228 | morning. He/she was not in so we talked to the front desk person. When we | told | her about the condom wrapper she said "eeeeggghhhhhh" and said that "she h |
| 228 | and said that "she has no idea what is up with the cleaning staff lately" | told | us that she would tell the manager. We never heard anything from the manag |
| 376 | leave early because of illness. Even though we went to the front desk and | told | them we were leaving the staff didn’t check us out of our room. A few week |
| 376 | er we called SPG to get the points for our unused nights refunded and were | told | we couldn’t because the hotel was claiming we stayed the entire 4 nights. |
| 420 | lax in the room. When I called the front desk and asked to be moved, I was | told | that as I had booked thru Priceline, a move would be considered an up char |
| 555 | or my wedding in Houston. The guy said yes to our weekend and then when we | told | him there was an event in Houston he said that he would have to honor the |
| 669 | he tiles and mold in the air filters. I contacted management again and was | told | there was nothing they could do about the noise!! and offered me free dinn |
df.loc[669]
index 670
ratings {'service': 1.0, 'cleanliness': 2.0, 'overall'...
title “SORRY excuse for the CONRAD HILTON name”
text DATED ROOMS: SCARY PARKING: POOR ROOM SERVICE;...
author {'username': 'MrsJamesMcClure', 'num_reviews':...
date_stayed August 2008
offering_id 108038
num_helpful_votes 0
date 2008-08-07 00:00:00
id 18720311
via_mobile False
lang ENGLISH
service 1.0
cleanliness 2.0
overall 1.0
value 2.0
location 1.0
sleep_quality NaN
rooms 1.0
check_in_front_desk 2.0
business_service_(e_g_internet_access) NaN
Name: 669, dtype: object
f['good'] = pd.Series(count(df[df['overall']==4.0]['text']))
N_good = f['good'].sum()
f['good_G2'] = f.progress_apply(lambda r: G2(r['good'], r['all'], N_good, N_all), axis=1)
f['good_pmi'] = f.progress_apply(lambda r: pmi(r['good'], r['all'], N_good, N_all), axis=1)
cloud(f['good'])
cloud(f[f['all']>500]['good_pmi'])
cloud(f['good_G2'])
f['great'] = pd.Series(count(df[df['overall']==5.0]['text']))
N_great = f['great'].sum()
f['great_G2'] = f.progress_apply(lambda r: G2(r['great'], r['all'], N_great, N_all), axis=1)
f['great_pmi'] = f.progress_apply(lambda r: pmi(r['great'], r['all'], N_great, N_all), axis=1)
cloud(f['great'])
cloud(f[f['all']>500]['great_pmi'])
cloud(f['great_G2'])